library(class)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(naniar)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
#Hello and welcome to Case Study 01, We are here to present findings from the Beers and Breweries datasets pulled from the company repository. We pulled these datasets into RStudio and conducted an extensive exploratory data analysis(EDA) for your review. We expect that Budweiser could benefit from our findings as you seek to get a greater understanding of the landscape of Breweries and Beer production in the US. For context, we looked at breweries and beers across the country by state and style with a specific focus on key beer characteristics including Alcohol By Volume and International Bitterness Unit.
#Read the Beers and Breweries data files in csv format
Beers = read.csv("/Users/Banu/Documents/SMU MSDS/MSDS_6306_DoingDataScience/Unit8/Beers.csv")
Breweries = read.csv("/Users/Banu/Documents/SMU MSDS/MSDS_6306_DoingDataScience/Unit8/Breweries.csv")
#Here are some of the questions that we asked as part of the EDA
#How many Breweries are present in each state #Group Breweries data by State #Plot a Bar Graph
Breweries %>% group_by(State) %>% dplyr::summarize(Breweries = n()) %>% print(n=51)
## # A tibble: 51 × 2
## State Breweries
## <chr> <int>
## 1 " AK" 7
## 2 " AL" 3
## 3 " AR" 2
## 4 " AZ" 11
## 5 " CA" 39
## 6 " CO" 47
## 7 " CT" 8
## 8 " DC" 1
## 9 " DE" 2
## 10 " FL" 15
## 11 " GA" 7
## 12 " HI" 4
## 13 " IA" 5
## 14 " ID" 5
## 15 " IL" 18
## 16 " IN" 22
## 17 " KS" 3
## 18 " KY" 4
## 19 " LA" 5
## 20 " MA" 23
## 21 " MD" 7
## 22 " ME" 9
## 23 " MI" 32
## 24 " MN" 12
## 25 " MO" 9
## 26 " MS" 2
## 27 " MT" 9
## 28 " NC" 19
## 29 " ND" 1
## 30 " NE" 5
## 31 " NH" 3
## 32 " NJ" 3
## 33 " NM" 4
## 34 " NV" 2
## 35 " NY" 16
## 36 " OH" 15
## 37 " OK" 6
## 38 " OR" 29
## 39 " PA" 25
## 40 " RI" 5
## 41 " SC" 4
## 42 " SD" 1
## 43 " TN" 3
## 44 " TX" 28
## 45 " UT" 4
## 46 " VA" 16
## 47 " VT" 10
## 48 " WA" 23
## 49 " WI" 20
## 50 " WV" 1
## 51 " WY" 4
#Breweries %>% ggplot(aes(x = State, fill = State)) + geom_bar() + ggtitle("Distribution of Breweries by State") + ylab(" # of Breweries")
Breweries %>% ggplot(aes(x = State, fill = State)) + geom_bar() + ggtitle("Distribution of Breweries by State") + ylab(" # of Breweries") + geom_text(stat = "count", aes(label = after_stat(count)), vjust = 0) + theme(legend.position = "none") + xlab("State")
#visualization for the above question, as a heat map on the USA map
#install.packages("usmap")
library(usmap)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
StateBeerC = data.frame(state = c("AK","AL","AR","AZ","CA","CO","CT","DC","DE","FL","GA","HI","IA","ID","IL","IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE","NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VA","VT","WA","WI","WV","WY"),values = c(7,3,2,11,39,47,8,1,2,15,7,4,5,5,18,22,3,4,5,23,7,9,32,12,9,2,9,19,1,5,3,3,4,2,16,15,6,29,25,5,4,1,3,28,4,16,10,23,20,1,4))
p <- plot_usmap(data = StateBeerC, values = "values", regions = "state") + scale_fill_continuous(low = "yellow", high = "red", name = "Number of Breweries", label = scales::comma) + labs(title = "Number of Breweries By State", ) + theme(legend.position = "right")
ggplotly(p)
#Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the #merged file.
Breweries$Brewery_id = Breweries$Brew_ID
Breweries <- Breweries %>% select(Brewery_id, Name, City, State)
BB <- merge(Beers,Breweries, by = "Brewery_id", all = TRUE)
dim(BB)
## [1] 2410 10
summary(BB)
## Brewery_id Name.x Beer_ID ABV
## Min. : 1.0 Length:2410 Min. : 1.0 Min. :0.00100
## 1st Qu.: 94.0 Class :character 1st Qu.: 808.2 1st Qu.:0.05000
## Median :206.0 Mode :character Median :1453.5 Median :0.05600
## Mean :232.7 Mean :1431.1 Mean :0.05977
## 3rd Qu.:367.0 3rd Qu.:2075.8 3rd Qu.:0.06700
## Max. :558.0 Max. :2692.0 Max. :0.12800
## NA's :62
## IBU Style Ounces Name.y
## Min. : 4.00 Length:2410 Min. : 8.40 Length:2410
## 1st Qu.: 21.00 Class :character 1st Qu.:12.00 Class :character
## Median : 35.00 Mode :character Median :12.00 Mode :character
## Mean : 42.71 Mean :13.59
## 3rd Qu.: 64.00 3rd Qu.:16.00
## Max. :138.00 Max. :32.00
## NA's :1005
## City State
## Length:2410 Length:2410
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
head(BB)
## Brewery_id Name.x Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Name.y City
## 1 American IPA 16 NorthGate Brewing Minneapolis
## 2 Milk / Sweet Stout 16 NorthGate Brewing Minneapolis
## 3 English Brown Ale 16 NorthGate Brewing Minneapolis
## 4 Pumpkin Ale 16 NorthGate Brewing Minneapolis
## 5 American Porter 16 NorthGate Brewing Minneapolis
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing Minneapolis
## State
## 1 MN
## 2 MN
## 3 MN
## 4 MN
## 5 MN
## 6 MN
tail(BB)
## Brewery_id Name.x Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Name.y City
## 2405 German Pilsener 12 Ukiah Brewing Company Ukiah
## 2406 Hefeweizen 12 Butternuts Beer and Ale Garrattsville
## 2407 American IPA 12 Butternuts Beer and Ale Garrattsville
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale Garrattsville
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company Anchorage
## State
## 2405 CA
## 2406 NY
## 2407 NY
## 2408 NY
## 2409 NY
## 2410 AK
#Address the missing values in each column #https://www.masterclass.com/articles/ibu-beer #We read there are styles of beer and each style has a range of IBU, we could use the sytle of the beer to assign a #value #After doing the missing value analysis to figure out if its a MCAR/MAR/MNAR, we felt the values are missing completely at Random, they are not missing based on another variable or missing because of themselves. We did some reading online for ABV and IBU values and learnt that these values are a range based on the style of the Beer, so we used mean ABV and IBU by Style, to fill in the missing data and merged all the data points
gg_miss_var(BB) + ggtitle("Missing Values in Dataset")
MeanIBU <- BB %>% filter(!is.na(IBU)) %>% group_by(Style) %>% dplyr::summarize(IBUMean = mean(IBU))
BB <- merge(BB,MeanIBU, by="Style")
BB$IBU[is.na(BB$IBU)] <- BB$IBUMean[is.na(BB$IBU)]
MeanABV <- BB %>% filter(!is.na(ABV)) %>% group_by(Style) %>% dplyr::summarize(ABVMean=mean(ABV))
BB <- merge(BB,MeanABV, by="Style")
BB$ABV[is.na(BB$ABV)] <- BB$ABVMean[is.na(BB$ABV)]
#Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.
BB %>% ggplot(aes(x = State, y = IBU, fill = State)) + geom_bar(position = "dodge", stat = "summary", fun="median") + ggtitle("Distribution of Median IBU of Breweries by State") + ylab("International Bitterness Unit") + theme(legend.position = "none")
BB %>% ggplot(aes(x = State, y = ABV, fill = State)) + geom_bar(position = "dodge", stat = "summary", fun="median") + ggtitle("Distribution of Median ABV of Breweries by State") + ylab("Alcohol By Volume") + theme(legend.position = "none")
#Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer? #Colorado has the maximum alcoholic beer and Oregon has the most bitter beeR #Finding States with Maximum IBU and ABV values
BB[which.max(BB$ABV),]
## Style Brewery_id
## 2130 Quadrupel (Quad) 52
## Name.x Beer_ID ABV IBU
## 2130 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale 2565 0.128 24
## Ounces Name.y City State IBUMean ABVMean
## 2130 19.2 Upslope Brewing Company Boulder CO 24 0.104
BB[which.max(BB$IBU),]
## Style Brewery_id Name.x Beer_ID
## 429 American Double / Imperial IPA 375 Bitter Bitch Imperial IPA 980
## ABV IBU Ounces Name.y City State IBUMean ABVMean
## 429 0.082 138 12 Astoria Brewing Company Astoria OR 93.32 0.08736893
#Comment on the summary statistics and distribution of the ABV variable. #Outliers: There are few outliers in the ABV data. #Skewness: ABV values have a right-skew distribution #Interquartile Range: 50th percentile of ABV values fall between 0.050 (Q1) and 0.067 (Q3). # (IQR = Q3 – Q1 = 0.017) #Range: The measure of spread is large.
summary(BB$ABV)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02700 0.05000 0.05600 0.05972 0.06700 0.12800
sd(BB$ABV)
## [1] 0.0134303
#Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer. #I do not think that there is enough evidence to say that there is a linear relationship between ABV and IBU values. Looking at the 50th percentile of data, there is a linear relationship, but outside of that range there is no clear linear relationship and I am wondering if this is because of certain syles of beers
BB %>% ggplot(aes(x=ABV, y=IBU)) + geom_point(color = "red") +geom_smooth(position="jitter") + geom_smooth() + geom_smooth(method = lm) + ggtitle("Distribution of IBU vs ABV")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'
#Trying another way of smoothing the curve, it does appear that there is
a linear relationship between ABV #and IBU values
BB %>% ggplot(aes(x=ABV, y=IBU)) + geom_point() +geom_smooth(method=lm) + ggtitle("Distribution of IBU vs ABV")
## `geom_smooth()` using formula = 'y ~ x'
#Investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of #Ale (any beer with “Ale” in its name other than IPA) #Create a dataset for the two syles of Beers (IPA and Ale) #Plot the relationship between ABV and IBU for IPA and Ales. From the plot we can tell that IPAs have highter values of ABU and IBU.
BBIPAAle <- BB %>% filter(str_detect(Style, "IPA")|str_detect(Style, " Ale"))
BBIPAAle$AleType <- ifelse(str_detect(BBIPAAle$Style,"IPA"),"IPA","Ale")
BBIPAAle %>% ggplot(aes(x=ABV, y = IBU, color = AleType)) + geom_point() +ggtitle("Distribution of IBU vs ABV by Style (IPA vs Ale)")
#We would like to investigate the relationship further using the KNN
Classification model. Below we re using the internal classification to
find the accuracy of our predictions and we see that we have 91%
accuracy using KNN internal classification model with a K value of 5
classifications = knn.cv(BBIPAAle[,c(5,6)],BBIPAAle$AleType, prob = TRUE, k = 5)
table(classifications,BBIPAAle$AleType)
##
## classifications Ale IPA
## Ale 892 67
## IPA 70 504
CM = confusionMatrix(table(classifications,BBIPAAle$AleType))
CM
## Confusion Matrix and Statistics
##
##
## classifications Ale IPA
## Ale 892 67
## IPA 70 504
##
## Accuracy : 0.9106
## 95% CI : (0.8952, 0.9244)
## No Information Rate : 0.6275
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.809
##
## Mcnemar's Test P-Value : 0.8643
##
## Sensitivity : 0.9272
## Specificity : 0.8827
## Pos Pred Value : 0.9301
## Neg Pred Value : 0.8780
## Prevalence : 0.6275
## Detection Rate : 0.5819
## Detection Prevalence : 0.6256
## Balanced Accuracy : 0.9049
##
## 'Positive' Class : Ale
##
#We would also like to investigate using the 70/30 Train/Test split. Here too, the accuracy is 89%
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
classifications = knn(train[,c(5,6)],test[,c(5,6)],train$AleType, prob = TRUE, k = 5)
table(classifications,test$AleType)
##
## classifications Ale IPA
## Ale 265 29
## IPA 17 149
CM = confusionMatrix(table(classifications,test$AleType))
CM
## Confusion Matrix and Statistics
##
##
## classifications Ale IPA
## Ale 265 29
## IPA 17 149
##
## Accuracy : 0.9
## 95% CI : (0.8689, 0.9259)
## No Information Rate : 0.613
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7866
##
## Mcnemar's Test P-Value : 0.1048
##
## Sensitivity : 0.9397
## Specificity : 0.8371
## Pos Pred Value : 0.9014
## Neg Pred Value : 0.8976
## Prevalence : 0.6130
## Detection Rate : 0.5761
## Detection Prevalence : 0.6391
## Balanced Accuracy : 0.8884
##
## 'Positive' Class : Ale
##
#Investigating using another classification model Naive Bayes with a 70/30 Train/Test split to compare with KNN classification and here we get an accuracy of 85%
splitPerc = .7 #Training / Test split Percentage
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
model = naiveBayes(train[,c(5,6)],train$AleType)
table(predict(model,test[,c(5,6)]),test$AleType)
##
## Ale IPA
## Ale 261 25
## IPA 24 150
CM = confusionMatrix(table(predict(model,test[,c(5,6)]),test$AleType))
CM
## Confusion Matrix and Statistics
##
##
## Ale IPA
## Ale 261 25
## IPA 24 150
##
## Accuracy : 0.8935
## 95% CI : (0.8616, 0.9201)
## No Information Rate : 0.6196
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7738
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9158
## Specificity : 0.8571
## Pos Pred Value : 0.9126
## Neg Pred Value : 0.8621
## Prevalence : 0.6196
## Detection Rate : 0.5674
## Detection Prevalence : 0.6217
## Balanced Accuracy : 0.8865
##
## 'Positive' Class : Ale
##
#Tuning NB for 100 iterations
# NB Loop for average of many training / test partition
iterations = 100
masterAcc = matrix(nrow = iterations, ncol = 3)
splitPerc = .7 #Training / Test split Percentage
for(j in 1:iterations)
{
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
model = naiveBayes(train[,c(5,6)],train$AleType)
table(predict(model,test[,c(5,6)]),test$AleType)
CM = confusionMatrix(table(predict(model,test[,c(5,6)]),test$AleType))
masterAcc[j,1] = CM$overall[1]
masterAcc[j,2] = CM$byClass[1]
masterAcc[j,3] = CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanAcc
## [1] 0.8758696 0.8991731 0.8369258
accs = data.frame(accuracy = numeric(30), k = numeric(30))
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
for(i in 1:30)
{
classifications = knn(train[,c(5,6)],test[,c(5,6)],train$AleType, prob = TRUE, k = i)
table(test$AleType,classifications)
CM = confusionMatrix(table(test$AleType,classifications))
accs$accuracy[i] = CM$overall[1]
accs$k[i] = i
}
plot(accs$k,accs$accuracy, type = "l", xlab = "k")
which.max(accs$accuracy)
## [1] 1
#Tuning KNN for 100 iterations and 30 values of K
iterations = 100
numks = 30
masterAcc = matrix(nrow = iterations, ncol = numks)
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(30), k = numeric(30))
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
for(i in 1:numks)
{
classifications = knn(train[,c(5,6)],test[,c(5,6)],train$AleType, prob = TRUE, k = i)
table(classifications,test$AleType)
CM = confusionMatrix(table(classifications,test$AleType))
masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
which.max(MeanAcc)
## [1] 5
mean(MeanAcc)
## [1] 0.8843667
plot(seq(1,numks,1),MeanAcc, type = "l",xlab = "K")
# With all the methods we can clearly see that Ale and IPA have a clear
distingsion with respect to ABB and IBU values #Classifying Beers into 7
broad Categories Ale, IPA, Stout, Pilsner, Beer, Lager and Other #Since
Budweiser beers are of Lager, comparing Lagers against IPA and Ales
#Lager is a smaller section of Beers when compared to Ale and IPAs.
Budweiser could look into capturing the market of IPAs an Ales
BBClassify <- BB
BBClassify$BeerType = ifelse(str_detect(BBClassify$Style, "IPA"),"IPA",ifelse(str_detect(BBClassify$Style, "Stout"), "Stout",ifelse(str_detect(BBClassify$Style, "Pilsner"),"Pilsner",ifelse(str_detect(BBClassify$Style, "Beer"),"Beer",ifelse(str_detect(BBClassify$Style, " Ale"),"Ale",ifelse(str_detect(BBClassify$Style, "Lager"),"Lager","Other"))))))
BBClassify %>% ggplot(aes(x=BeerType, fill= BeerType)) + geom_bar() + ggtitle("Distribution of Beers by Beer Type") + ylab("Beers") + xlab("Beer Type")
BBClassify %>% filter(BeerType == "IPA" |BeerType == "Ale" |BeerType == "Lager") %>% ggplot(aes(x=ABV, y = IBU, color = BeerType)) + geom_jitter() + ggtitle("Distribution of IBU vs ABV by Style (IPA vs Ale vs Lager)")